# Load the raw data set: semicolon-separated, with "n/a" marking missing values.
total_500 <- read.csv("~/GitHub/thesis_msc_business_analytics/Python/total_500_new.csv", sep=";", na.strings="n/a")
# Number of observations and variables in the raw data.
dim(total_500)
## [1] 500 730
# Work on a copy so that the raw import stays untouched.
total_500_sub <- total_500

# These four columns are written with a decimal comma; replace it with a
# decimal point so that they can be parsed as numbers.
decimal_comma_cols <- c(
  "Assets..", "Market.value..", "Revenues..", "Total.Stockholder.Equity.."
)
for (col in decimal_comma_cols) {
  total_500_sub[[col]] <- gsub(",", ".", total_500_sub[[col]])
}

# Parse a column as numeric.  A factor is converted through its labels first,
# because as.numeric() applied directly to a factor returns the internal level
# codes instead of the values.
to_numeric <- function(x) {
  if (is.factor(x)) {
    x <- as.character(x)
  }
  as.numeric(x)
}

# Columns that must not be coerced:
#   * Readability is recoded by hand later in the script;
#   * company, url and Revenues...1 are deleted below; coercing their text to
#     numeric would yield NA in every row and make na.omit() drop all rows.
# Selecting by name (instead of fixed positions 1:18 and 20:730) keeps the
# conversion valid if the column layout changes.
not_converted <- c("Readability", "company", "url", "Revenues...1")
numeric_cols <- setdiff(names(total_500_sub), not_converted)
total_500_sub[numeric_cols] <- lapply(total_500_sub[numeric_cols], to_numeric)
## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

# Drop every row that has a missing value in any column.
total_500_final <- na.omit(total_500_sub)

# Give the Fortune 500 columns readable names (column 1, X, is the ranking).
colnames(total_500_final)[c(1, 2, 3, 4, 6)] <- c(
  "Ranking", "Assets", "Market_Value", "Revenues", "Total_SH_Equity"
)

# Delete the variables that are not needed for the analysis.
total_500_final$Revenues...1 <- NULL # Revenues %
total_500_final$company <- NULL      # company name
total_500_final$url <- NULL          # company url
#we upload the libraries beneath that we will use in the analysis
library(ggplot2)
library(reshape2)
library(DAAG)
## Loading required package: lattice
# Size of the cleaned data set used in the rest of the analysis.
dim(total_500_final)
## [1] 408 727
#######################################################################################################
# One histogram per Fortune variable, to get a feeling for how each of them
# is distributed.
ggplot(data = total_500_final, mapping = aes(x = Revenues)) +
  geom_histogram(binwidth = 50, colour = "green", fill = "darkgreen")

ggplot(data = total_500_final, mapping = aes(x = Assets)) +
  geom_histogram(binwidth = 100, colour = "red", fill = "darkred")

ggplot(data = total_500_final, mapping = aes(x = Market_Value)) +
  geom_histogram(binwidth = 100, colour = "blue", fill = "darkblue")

ggplot(data = total_500_final, mapping = aes(x = Total_SH_Equity)) +
  geom_histogram(binwidth = 100, colour = "purple", fill = "pink")

###############################################################################################
# Scatter plots of each Fortune 500 variable against the Ranking.
ggplot(data = total_500_final, mapping = aes(x = Assets, y = Ranking)) +
  geom_point(colour = "red")

ggplot(data = total_500_final, mapping = aes(x = Market_Value, y = Ranking)) +
  geom_point(colour = "blue")

ggplot(data = total_500_final, mapping = aes(x = Total_SH_Equity, y = Ranking)) +
  geom_point(colour = "purple")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = Ranking)) +
  geom_point(colour = "green")

# Revenues is the variable most closely related to the Ranking, so one of the
# two is used when relating the Fortune data to the website metrics.
# The correlation matrix of the five Fortune variables gives a clearer picture.
total_500_fortune <- total_500_final[, 1:5]
library(corrplot)
library(caret)
sm <- cor(total_500_fortune)
sm
##                    Ranking      Assets Market_Value    Revenues
## Ranking          1.0000000 -0.36673307  -0.15959008 -0.67511457
## Assets          -0.3667331  1.00000000   0.16787320  0.43479882
## Market_Value    -0.1595901  0.16787320   1.00000000  0.31085660
## Revenues        -0.6751146  0.43479882   0.31085660  1.00000000
## Total_SH_Equity  0.1327272 -0.03638159  -0.02912268 -0.05616772
##                 Total_SH_Equity
## Ranking              0.13272724
## Assets              -0.03638159
## Market_Value        -0.02912268
## Revenues            -0.05616772
## Total_SH_Equity      1.00000000
# Plot the matrix computed above with the coefficients printed in the cells.
corrplot(sm, method = "number")

#From this plot we see that, among the Fortune variables, Revenues has the strongest correlation with the Ranking (about -0.68).
##########################################################################################################
# First we analyze the social media presence of the sites: for each network,
# the share of companies that have an account, shown as a pie chart, and the
# relationship between having an account and Revenues.

# Build the pie-slice labels from a table of shares instead of typing the
# percentages by hand.  The flags are coded 0 (no account) / 1 (account), as
# the printed tables below show; any level other than "1" is labelled "no".
share_labels <- function(shares) {
  answer <- ifelse(names(shares) == "1", "% yes", "% no")
  paste(round(100 * as.vector(shares), 1), answer)
}

# Number of sites in the sample; used as the denominator of every share so
# that the shares stay correct if the number of retained rows changes.
n_sites <- nrow(total_500_final)

#Facebook
social_media_facebook <- round(table(total_500_final$facebook) / n_sites, 3)
social_media_facebook
## 
##     0     1 
## 0.353 0.647
slicelable <- share_labels(social_media_facebook)
pie(social_media_facebook, labels = slicelable, main = "Share of companies with Facebook", col = rainbow(length(social_media_facebook)))

ggplot(total_500_final, aes(Revenues, facebook)) + geom_point(size = 3, colour = "darkblue")

#Twitter
social_media_twitter <- round(table(total_500_final$twitter) / n_sites, 3)
social_media_twitter
## 
##     0     1 
## 0.314 0.686
slicelable <- share_labels(social_media_twitter)
pie(social_media_twitter, labels = slicelable, main = "Share of companies with Twitter", col = rainbow(length(social_media_twitter)))

ggplot(total_500_final, aes(Revenues, twitter)) + geom_point(size = 3, colour = "darkgreen")

#Instagram
social_media_instagram <- round(table(total_500_final$instagram) / n_sites, 3)
social_media_instagram
## 
##     0     1 
## 0.777 0.223
slicelable <- share_labels(social_media_instagram)
pie(social_media_instagram, labels = slicelable, main = "Share of companies with Instagram", col = rainbow(length(social_media_instagram)))

ggplot(total_500_final, aes(Revenues, instagram)) + geom_point(size = 3, colour = "pink")

#Pinterest
social_media_pinterest <- round(table(total_500_final$pinterest) / n_sites, 3)
social_media_pinterest
## 
##     0     1 
## 0.902 0.098
slicelable <- share_labels(social_media_pinterest)
pie(social_media_pinterest, labels = slicelable, main = "Share of companies with Pinterest", col = rainbow(length(social_media_pinterest)))

ggplot(total_500_final, aes(Revenues, pinterest)) + geom_point(size = 3, colour = "darkred")

#Youtube
social_media_youtube <- round(table(total_500_final$youtube) / n_sites, 3)
social_media_youtube
## 
##     0     1 
## 0.417 0.583
slicelable <- share_labels(social_media_youtube)
pie(social_media_youtube, labels = slicelable, main = "Share of companies with Youtube", col = rainbow(length(social_media_youtube)))

ggplot(total_500_final, aes(Revenues, youtube)) + geom_point(size = 3, colour = "red")

#LinkedIn
social_media_linkedin <- round(table(total_500_final$linkedin) / n_sites, 3)
social_media_linkedin
## 
##     0     1 
## 0.429 0.571
slicelable <- share_labels(social_media_linkedin)
pie(social_media_linkedin, labels = slicelable, main = "Share of companies with Linkedin", col = rainbow(length(social_media_linkedin)))

ggplot(total_500_final, aes(Revenues, linkedin)) + geom_point(size = 3, colour = "blue")

# Correlations between Revenues (column 4) and the six social media flags
# (columns 10 to 15).
total_500_social_media <- total_500_final[, c(4, 10:15)]
library(corrplot)
library(caret)
sm <- cor(total_500_social_media)
sm
##               Revenues   facebook  instagram     linkedin   pinterest
## Revenues   1.000000000 0.01121852 0.05771665 -0.008311532  0.09686843
## facebook   0.011218524 1.00000000 0.35874256  0.520581725  0.24349238
## instagram  0.057716654 0.35874256 1.00000000  0.143134960  0.37774489
## linkedin  -0.008311532 0.52058172 0.14313496  1.000000000 -0.03069495
## pinterest  0.096868426 0.24349238 0.37774489 -0.030694951  1.00000000
## twitter    0.002185367 0.67230226 0.32419034  0.577378625  0.20514804
## youtube    0.074833925 0.54096275 0.32145351  0.482997415  0.19504737
##               twitter    youtube
## Revenues  0.002185367 0.07483393
## facebook  0.672302259 0.54096275
## instagram 0.324190344 0.32145351
## linkedin  0.577378625 0.48299741
## pinterest 0.205148042 0.19504737
## twitter   1.000000000 0.52142857
## youtube   0.521428571 1.00000000
# Plot the matrix computed above with the coefficients printed in the cells.
corrplot(sm, method = "number")

#we see that facebook has correlation more than 50% with twitter, youtube and linkedin
#And that the smallest correlations are those of pinterest and instagram
#########################################################################################################
# Links: for each link count (total, external, internal) a histogram of its
# distribution followed by a scatter plot against Revenues.
par(mfrow = c(1, 1))
library(ggplot2)
ggplot(data = total_500_final, mapping = aes(x = total.links)) +
  geom_histogram(binwidth = 50, colour = "darkblue", fill = "blue")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = total.links)) +
  geom_point(size = 3, colour = "darkblue")

ggplot(data = total_500_final, mapping = aes(x = external)) +
  geom_histogram(binwidth = 50, colour = "darkred", fill = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = external)) +
  geom_point(size = 3, colour = "darkred")

ggplot(data = total_500_final, mapping = aes(x = internal)) +
  geom_histogram(binwidth = 50, colour = "darkgreen", fill = "green")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = internal)) +
  geom_point(size = 3, colour = "darkgreen")

# Correlations between Revenues (column 4) and the link counts
# (columns 21 to 23).
total_500_links <- total_500_final[, c(4, 21:23)]
library(corrplot)
library(caret)
tl <- cor(total_500_links)
tl
##               Revenues     external     internal total.links
## Revenues    1.00000000  0.034100506  0.004559950  0.01538199
## external    0.03410051  1.000000000 -0.002593961  0.32202419
## internal    0.00455995 -0.002593961  1.000000000  0.94589294
## total.links 0.01538199  0.322024191  0.945892937  1.00000000
# Plot the matrix computed above with the coefficients printed in the cells.
corrplot(tl, method = "number")

#We can see that the total links with the internal links have a correlation almost 95%.
#So we will not include the total links in the regression model
#########################################################################################################
# Loading time per site: its distribution and its relationship with Revenues.
ggplot(data = total_500_final, mapping = aes(x = loading.time)) +
  geom_histogram(binwidth = 1, colour = "pink", fill = "purple")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = loading.time)) +
  geom_point(size = 3, colour = "purple")

#########################################################################################################
# Sentences, unique words and total words: how each one is distributed on its
# own and how it relates to Revenues.
ggplot(data = total_500_final, mapping = aes(x = Sentences)) +
  geom_histogram(binwidth = 50, colour = "darkred", fill = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = Sentences)) +
  geom_point(size = 3, colour = "purple")

#########################
ggplot(data = total_500_final, mapping = aes(x = Unique.words)) +
  geom_histogram(binwidth = 50, colour = "darkred", fill = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = Unique.words)) +
  geom_point(size = 3, colour = "purple")

#########################
ggplot(data = total_500_final, mapping = aes(x = Words)) +
  geom_histogram(binwidth = 50, colour = "darkred", fill = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = Words)) +
  geom_point(size = 3, colour = "purple")

#############################
# Correlations between Revenues (column 4), the three text counts
# (columns 18 to 20) and the loading time (column 727).
total_500_lt_w <- total_500_final[, c(4, 18:20, 727)]
library(corrplot)
library(caret)
tl <- cor(total_500_lt_w)
tl
##                 Revenues   Sentences Unique.words       Words loading.time
## Revenues      1.00000000 -0.01183819  -0.04362118 -0.03479049   -0.1212650
## Sentences    -0.01183819  1.00000000   0.69454327  0.78851979    0.1497520
## Unique.words -0.04362118  0.69454327   1.00000000  0.93243940    0.1994296
## Words        -0.03479049  0.78851979   0.93243940  1.00000000    0.1857922
## loading.time -0.12126500  0.14975205   0.19942956  0.18579225    1.0000000
# Plot the matrix computed above with the coefficients printed in the cells.
corrplot(tl, method = "number")

################################

# Flesch measure: its distribution and its relationship with Revenues.
ggplot(data = total_500_final, aes(x = Flesh_Mesaure)) +
  geom_histogram(binwidth = 50, colour = "darkred", fill = "red")

ggplot(total_500_final, aes(Revenues, Flesh_Mesaure)) +
  geom_point(size = 3, colour = "purple")

############################
# Readability labels mapped to codes with a numeric prefix, so that the bar
# plot below shows the categories in the order of their codes rather than
# alphabetically.  The replacements are applied one after another in the
# order listed here.
readability_codes <- c(
  "Very easy"        = "01_VE",
  "Easy"             = "02_E",
  "Fairly easy"      = "03_FE",
  "Standard"         = "04_St",
  "Fairly difficult" = "05_FD",
  "Difficult"        = "06_D",
  "Very Confusing"   = "07_VC"
)
for (readability_label in names(readability_codes)) {
  total_500_final$Readability <- gsub(
    readability_label,
    readability_codes[[readability_label]],
    total_500_final$Readability
  )
}
barplot(table(total_500_final$Readability), col = "dark red")

# Replace each code by its position in the list (1 to 7) and make the
# variable numeric so that it can be used in correlations and models.
for (readability_level in seq_along(readability_codes)) {
  total_500_final$Readability <- gsub(
    readability_codes[[readability_level]],
    as.character(readability_level),
    total_500_final$Readability
  )
}
total_500_final$Readability <- as.numeric(total_500_final$Readability)

# geom_bar() has no `binwidth` parameter; geom_histogram() with a bin width of
# one draws one bar per readability level.
ggplot(data = total_500_final, aes(x = Readability)) +
  geom_histogram(binwidth = 1, colour = "darkred", fill = "red")

ggplot(total_500_final, aes(Revenues, Readability)) +
  geom_point(size = 3, colour = "purple")

# Correlations between Revenues (column 4), the Flesch measure (column 16)
# and the numeric readability level (column 17).
total_500_r <- total_500_final[, c(4, 16, 17)]
library(corrplot)
library(caret)
tl <- cor(total_500_r)
tl
##                  Revenues Flesh_Mesaure Readability
## Revenues       1.00000000    0.02476229 -0.02694931
## Flesh_Mesaure  0.02476229    1.00000000 -0.17094994
## Readability   -0.02694931   -0.17094994  1.00000000
corrplot(tl, method = "number")

#########################################################################################################
# Number of errors and number of warnings: distribution of each one and its
# relationship with Revenues.
ggplot(data = total_500_final, mapping = aes(x = number_of_errors)) +
  geom_histogram(binwidth = 50, colour = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = number_of_errors)) +
  geom_point(size = 3, colour = "dark red")

ggplot(data = total_500_final, mapping = aes(x = number_of_warning)) +
  geom_histogram(binwidth = 20, colour = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = number_of_warning)) +
  geom_point(size = 3, colour = "dark blue")

#########################################################################################################
#########################################################################################################
# The non.document.error and The_page_opened variables: distribution of each
# one and its relationship with Revenues.
ggplot(data = total_500_final, mapping = aes(x = non.document.error)) +
  geom_histogram(binwidth = 1, colour = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = non.document.error)) +
  geom_point(size = 1, colour = "dark red")

ggplot(data = total_500_final, mapping = aes(x = The_page_opened)) +
  geom_histogram(binwidth = 1, colour = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = The_page_opened)) +
  geom_point(size = 3, colour = "dark blue")

# The_page_opened only takes the value 1 (the page opened) in this sample, so
# it cannot affect the outcome and there is no point in using it in the analysis.
#########################################################################################################
# Correlations between Revenues (column 4) and the three HTML validation
# variables (columns 7 to 9).
total_500_html <- total_500_final[, c(4, 7:9)]
library(corrplot)
library(caret)
tl <- cor(total_500_html)
tl
##                       Revenues non.document.error number_of_errors
## Revenues            1.00000000         -0.0748407        0.0800205
## non.document.error -0.07484070          1.0000000       -0.2545301
## number_of_errors    0.08002050         -0.2545301        1.0000000
## number_of_warning   0.09505013         -0.2242315        0.2309578
##                    number_of_warning
## Revenues                  0.09505013
## non.document.error       -0.22423152
## number_of_errors          0.23095778
## number_of_warning         1.00000000
# Plot the matrix computed above with the coefficients printed in the cells.
corrplot(tl, method = "number")

# Total number of images: its distribution and its relationship with Revenues.
ggplot(data = total_500_final, mapping = aes(x = total.images)) +
  geom_histogram(binwidth = 100, colour = "darkred", fill = "red")

ggplot(data = total_500_final, mapping = aes(x = Revenues, y = total.images)) +
  geom_point(size = 3, colour = "dark blue")

#########################################################################################################
# Frequency of the image types that are used: one bar plot per image-type
# column, showing the share of sites for each value of that column.

par(mfrow = c(1, 1))
# Positions of the nine image-type columns.
k <- 717:725
# Denominator of the shares: the number of sites in the sample, taken from
# the data instead of being typed in as a constant.
n_sites <- nrow(total_500_final)
for (a in k) {
  image_type <- round(table(total_500_final[, a]) / n_sites, 3)
  barplot(
    image_type,
    xlab = names(total_500_final)[a],
    ylab = "Shares of images per site",
    col = "dark green"
  )
}

#It is obvious that the most common images type are .jpg, gif and .png
# Each image type in relationship with Revenues.
ggplot(total_500_final, aes(Revenues, .bmp)) + geom_point(size = 3, colour = "dark blue")

ggplot(total_500_final, aes(Revenues, .dib)) + geom_point(size = 3, colour = "dark blue")

ggplot(total_500_final, aes(Revenues, .gif)) + geom_point(size = 3, colour = "dark blue")

ggplot(total_500_final, aes(Revenues, .jpe)) + geom_point(size = 3, colour = "dark blue")

ggplot(total_500_final, aes(Revenues, .jpeg)) + geom_point(size = 3, colour = "dark blue")

ggplot(total_500_final, aes(Revenues, .jpg)) + geom_point(size = 3, colour = "dark blue")

ggplot(total_500_final, aes(Revenues, .png)) + geom_point(size = 3, colour = "dark blue")

ggplot(total_500_final, aes(Revenues, .tif)) + geom_point(size = 3, colour = "dark blue")

ggplot(total_500_final, aes(Revenues, .tiff)) + geom_point(size = 3, colour = "dark blue")

# Correlations between Revenues (column 4), the image types and the total
# number of images (columns 717 to 726).
total_500_im <- total_500_final[, c(4, 717:726)]
library(corrplot)
library(caret)
tl <- cor(total_500_im)
tl
##                  Revenues         .bmp          .dib         .gif
## Revenues      1.000000000  0.083489281  0.0877047067 -0.020930575
## .bmp          0.083489281  1.000000000 -0.0013011275 -0.005780172
## .dib          0.087704707 -0.001301127  1.0000000000  0.196433219
## .gif         -0.020930575 -0.005780172  0.1964332192  1.000000000
## .jpe          0.059660288 -0.003534780  0.9108504455  0.235836802
## .jpeg         0.059427022 -0.003482535  0.9108963639  0.236016392
## .jpg          0.004515870  0.013995542  0.0081915788 -0.006939606
## .png          0.022053238 -0.001301001  0.2204575537  0.164538688
## .tif         -0.002466003  0.050383371  0.0628334455  0.031167772
## .tiff        -0.030165817 -0.005659409 -0.0008571371  0.007252077
## total.images  0.051605339  0.018395189  0.7961253175  0.319131935
##                      .jpe        .jpeg         .jpg         .png
## Revenues      0.059660288  0.059427022  0.004515870  0.022053238
## .bmp         -0.003534780 -0.003482535  0.013995542 -0.001301001
## .dib          0.910850445  0.910896364  0.008191579  0.220457554
## .gif          0.235836802  0.236016392 -0.006939606  0.164538688
## .jpe          1.000000000  0.999991326 -0.008220505  0.231367993
## .jpeg         0.999991326  1.000000000 -0.008242305  0.231422560
## .jpg         -0.008220505 -0.008242305  1.000000000  0.244033499
## .png          0.231367993  0.231422560  0.244033499  1.000000000
## .tif          0.007431086  0.007587510  0.375321187  0.259001712
## .tiff        -0.005630343 -0.005548159  0.224429140  0.040635288
## total.images  0.855175392  0.855225420  0.413319367  0.529706228
##                      .tif         .tiff total.images
## Revenues     -0.002466003 -0.0301658169   0.05160534
## .bmp          0.050383371 -0.0056594093   0.01839519
## .dib          0.062833445 -0.0008571371   0.79612532
## .gif          0.031167772  0.0072520772   0.31913194
## .jpe          0.007431086 -0.0056303426   0.85517539
## .jpeg         0.007587510 -0.0055481589   0.85522542
## .jpg          0.375321187  0.2244291400   0.41331937
## .png          0.259001712  0.0406352880   0.52970623
## .tif          1.000000000  0.0222123897   0.34887215
## .tiff         0.022212390  1.0000000000   0.07827891
## total.images  0.348872154  0.0782789113   1.00000000
# Plot the image-type correlation matrix stored in tl above.
corrplot(tl, method = "number")

# Frequency of the image sizes that are used.
# Sample size and its half, taken from the data so that the thresholds below
# follow the number of rows that survived the cleaning.
n_sites <- nrow(total_500_final)
half_sites <- n_sites / 2

# k collects the size columns whose first tabulated value covers every site,
# i.e. columns that hold one single value and carry no information.
size_cols <- 24:716
single_valued <- vapply(
  size_cols,
  function(i) table(total_500_final[, i])[[1]] == n_sites,
  logical(1)
)
k <- size_cols[single_valued]
#####################
# Column 24 holds the same value for every site, so it is not used.
names(total_500_final)[24]
## [1] "X144x144"
total_500_final$X144x144 <- NULL

# After the removal the size columns occupy positions 24 to 715.
size_cols <- 24:715
# For every size column, the number of sites in the second tabulated category
# (the sites that have the size, assuming the columns are 0/1 flags).
# A column with a single category has no second count and is skipped instead
# of stopping the script with a subscript error.
second_count <- vapply(
  size_cols,
  function(i) {
    counts <- table(total_500_final[, i])
    if (length(counts) < 2) NA_integer_ else counts[[2]]
  },
  integer(1)
)
# Sizes present in less than half of the sites ...
false_not_existing <- size_cols[which(second_count < half_sites)]
# ... and sizes present in more than half of the sites.
true_existing <- size_cols[which(second_count > half_sites)]

# For each given column position: scatter plot of the size flag against
# Revenues, followed by a bar plot of how many sites have / do not have it.
plot_size_columns <- function(cols) {
  for (a in cols) {
    plot(total_500_final[, a], total_500_final$Revenues)
    image_size <- table(total_500_final[, a])
    barplot(
      image_size,
      xlab = names(total_500_final)[a],
      ylab = "Has or not the size",
      col = "dark green"
    )
  }
  invisible(NULL)
}

########################
# Sizes that exist in less than half of the sites: check graphically how they
# vary across the sites.
par(mfrow = c(3, 3))
plot_size_columns(false_not_existing)

# Sizes that exist in more than half of the sites: same graphical check.
par(mfrow = c(3, 3))
plot_size_columns(true_existing)

#By checking the above plots we can see that the 24 first sizes do appear to have some differentiation regarding the revenues. While most sites do have those sizes when it comes to the high revienues they do not have them
par(mfrow = c(3, 3))
# Keep the first 24 of the frequently used sizes.
keep <- head(true_existing, 24)
keep
##  [1] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## [24] 47
# These are the only sizes kept for the further analysis; every other size
# column is dropped.  A logical mask is used so that an empty drop list keeps
# all columns.
dropped_sizes <- setdiff(size_cols, keep)
total_500_final <- total_500_final[, !(seq_along(total_500_final) %in% dropped_sizes), drop = FALSE]
# Remove the other Fortune 500 variables, since they would interfere with the
# outcome of the model, and The_page_opened; Revenues is the only Fortune
# variable that is kept, as the variable to be examined.
total_500_final <- total_500_final[
  ,
  !(names(total_500_final) %in% c(
    "Market_Value", "Assets", "Ranking", "Total_SH_Equity", "The_page_opened"
  ))
]
# Split the data into a training set and a test set (p = 0.85 for training),
# with a fixed seed so that the partition can be reproduced.
library(caret)
set.seed(20)
sampling_vector <- createDataPartition(
  y = total_500_final$Revenues,
  p = 0.85,
  list = FALSE
)
total_500_final_train <- total_500_final[sampling_vector, ]
total_500_final_test <- total_500_final[-sampling_vector, ]
# Regression models are used to see which website variables matter most for
# the Revenues of the company.
# Start with the empty (intercept-only) linear model.
model_null <- lm(formula = Revenues ~ 1, data = total_500_final_train)
summary(model_null)
## 
## Call:
## lm(formula = Revenues ~ 1, data = total_500_final_train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -17.16 -15.24 -11.17  -1.43 211.43 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   22.288      1.703   13.09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.77 on 347 degrees of freedom
#####################################################################################################
# Full linear model and LASSO models
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-2
# Full model: Revenues regressed on every other variable of the training set.
full <- lm(formula = Revenues ~ ., data = total_500_final_train)
summary(full)
## 
## Call:
## lm(formula = Revenues ~ ., data = total_500_final_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.199  -8.486  -3.461   2.223  65.721 
## 
## Coefficients: (14 not defined because of singularities)
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        438.837989  29.754450  14.749  < 2e-16 ***
## non.document.error  -0.973835   1.984634  -0.491 0.623996    
## number_of_errors     0.012911   0.010948   1.179 0.239167    
## number_of_warning   -0.020989   0.038979  -0.538 0.590653    
## facebook            -3.004443   2.410328  -1.246 0.213530    
## instagram            2.605986   2.234726   1.166 0.244460    
## linkedin            -0.325461   2.191702  -0.148 0.882047    
## pinterest            1.109936   3.221429   0.345 0.730669    
## twitter              1.567012   2.451651   0.639 0.523189    
## youtube              2.272481   2.072409   1.097 0.273696    
## Flesh_Mesaure        0.001529   0.004208   0.363 0.716631    
## Readability          1.613550   0.770141   2.095 0.036973 *  
## Sentences           -0.006214   0.010664  -0.583 0.560540    
## Unique.words        -0.005883   0.019639  -0.300 0.764725    
## Words                0.002363   0.004897   0.482 0.629833    
## external             0.004634   0.017992   0.258 0.796924    
## internal             0.001545   0.007942   0.195 0.845906    
## total.links                NA         NA      NA       NA    
## X15x75             -20.966921  20.586365  -1.018 0.309244    
## X8x15              -30.814241  20.765759  -1.484 0.138856    
## X44x556            -17.975428  20.815629  -0.864 0.388503    
## X1x1                       NA         NA      NA       NA    
## X800x1200           -9.979089  18.025681  -0.554 0.580250    
## autox100.           -3.048954  18.426054  -0.165 0.868682    
## X24pxx133px        -15.566911  18.548403  -0.839 0.401973    
## X21pxx173px                NA         NA      NA       NA    
## X46x214                    NA         NA      NA       NA    
## X49x49                     NA         NA      NA       NA    
## X50x45              -6.206753  18.046674  -0.344 0.731134    
## X400x300           -12.942588  18.222831  -0.710 0.478091    
## X292pxx292px        -6.790009  14.896169  -0.456 0.648838    
## X200pxx200px               NA         NA      NA       NA    
## X1279pxx984px              NA         NA      NA       NA    
## X300pxx1500px              NA         NA      NA       NA    
## X29x29              -7.177375  14.533766  -0.494 0.621769    
## X115x223            -8.312448  18.150382  -0.458 0.647291    
## X160x233                   NA         NA      NA       NA    
## X300x993                   NA         NA      NA       NA    
## X41x192                    NA         NA      NA       NA    
## X28x221                    NA         NA      NA       NA    
## X15x12                     NA         NA      NA       NA    
## X60x60             -75.505052  14.921561  -5.060  7.2e-07 ***
## .bmp                 2.398250   0.633335   3.787 0.000183 ***
## .dib                 0.316324   1.107796   0.286 0.775419    
## .gif                -0.064718   0.066521  -0.973 0.331361    
## .jpe                -0.610764   4.148684  -0.147 0.883055    
## .jpeg                0.640994   4.149986   0.154 0.877350    
## .jpg                 0.007073   0.024010   0.295 0.768499    
## .png                -0.011736   0.031687  -0.370 0.711358    
## .tif                -0.009165   0.042009  -0.218 0.827451    
## .tiff               -3.166227   5.182089  -0.611 0.541652    
## total.images               NA         NA      NA       NA    
## loading.time        -5.750583   2.043199  -2.814 0.005199 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.37 on 309 degrees of freedom
## Multiple R-squared:  0.8179, Adjusted R-squared:  0.7955 
## F-statistic: 36.52 on 38 and 309 DF,  p-value: < 2.2e-16
# Design matrix of the full model without its first (intercept) column.
x <- model.matrix(full)[, -1]
dim(x)
## [1] 348  52
# LASSO path for Revenues, plotted against lambda with labelled curves.
lasso <- glmnet(x = x, y = total_500_final_train$Revenues)
par(mfrow = c(1, 1), no.readonly = TRUE)
plot(lasso, xvar = "lambda", label = TRUE)

# Cross-validated LASSO, used to choose lambda.
lassob <- cv.glmnet(x = x, y = total_500_final_train$Revenues)
lassob$lambda.min
## [1] 1.497886
lassob$lambda.1se
## [1] 6.046992
plot(lassob)

# Coefficients at lambda.min.
blasso <- coef(lassob, s = "lambda.min")
blasso
## 53 x 1 sparse Matrix of class "dgCMatrix"
##                                1
## (Intercept)         3.947708e+02
## non.document.error  .           
## number_of_errors    .           
## number_of_warning   .           
## facebook            .           
## instagram           .           
## linkedin            .           
## pinterest           .           
## twitter             .           
## youtube             .           
## Flesh_Mesaure       .           
## Readability         .           
## Sentences           .           
## Unique.words        .           
## Words               .           
## external            .           
## internal            .           
## total.links         .           
## X15x75             -6.454344e+00
## X8x15              -2.581794e+01
## X44x556            -2.274898e+01
## X1x1               -1.817242e-10
## X800x1200          -4.024409e+00
## autox100.          -2.731837e+00
## X24pxx133px        -1.264747e+01
## X21pxx173px        -8.311505e-14
## X46x214            -2.509133e-14
## X49x49             -1.515554e-01
## X50x45             -4.483822e+00
## X400x300           -1.735377e+01
## X292pxx292px       -7.628580e+00
## X200pxx200px        .           
## X1279pxx984px       .           
## X300pxx1500px       .           
## X29x29             -5.198336e+00
## X115x223           -1.075696e+01
## X160x233           -1.736363e-14
## X300x993           -3.268447e-14
## X41x192             .           
## X28x221             .           
## X15x12              .           
## X60x60             -6.896751e+01
## .bmp                1.353565e+00
## .dib                .           
## .gif                .           
## .jpe                9.091761e-03
## .jpeg               .           
## .jpg                .           
## .png                .           
## .tif                .           
## .tiff               .           
## total.images        .           
## loading.time       -1.663331e+00
dim(blasso)
## [1] 53  1
# Coefficients (without the intercept) multiplied by the standard deviation
# of the corresponding column of x: first for the LASSO, then for the full
# linear model.
zblasso <- blasso[-1] * apply(x, MARGIN = 2, FUN = sd)
zbolt <- coef(full)[-1] * apply(x, MARGIN = 2, FUN = sd)
azbolt <- abs(zbolt)
sum(azbolt)
## [1] NA
# The sum is NA, which means that some variables have to be removed.
# To find out which ones, print the coefficients of the full model and look
# for those that are NA.
coef(full)
##        (Intercept) non.document.error   number_of_errors 
##      438.837988834       -0.973834742        0.012911122 
##  number_of_warning           facebook          instagram 
##       -0.020988515       -3.004442960        2.605986169 
##           linkedin          pinterest            twitter 
##       -0.325461120        1.109935889        1.567011663 
##            youtube      Flesh_Mesaure        Readability 
##        2.272481175        0.001528797        1.613549502 
##          Sentences       Unique.words              Words 
##       -0.006213607       -0.005882824        0.002362520 
##           external           internal        total.links 
##        0.004633997        0.001544738                 NA 
##             X15x75              X8x15            X44x556 
##      -20.966920757      -30.814240798      -17.975427612 
##               X1x1          X800x1200          autox100. 
##                 NA       -9.979088664       -3.048953972 
##        X24pxx133px        X21pxx173px            X46x214 
##      -15.566910594                 NA                 NA 
##             X49x49             X50x45           X400x300 
##                 NA       -6.206752976      -12.942588163 
##       X292pxx292px       X200pxx200px      X1279pxx984px 
##       -6.790009197                 NA                 NA 
##      X300pxx1500px             X29x29           X115x223 
##                 NA       -7.177375043       -8.312447514 
##           X160x233           X300x993            X41x192 
##                 NA                 NA                 NA 
##            X28x221             X15x12             X60x60 
##                 NA                 NA      -75.505051947 
##               .bmp               .dib               .gif 
##        2.398249510        0.316324310       -0.064718207 
##               .jpe              .jpeg               .jpg 
##       -0.610763627        0.640994242        0.007073258 
##               .png               .tif              .tiff 
##       -0.011735917       -0.009164584       -3.166227158 
##       total.images       loading.time 
##                 NA       -5.750583435
# Refit the linear model without the predictors whose coefficient in `full`
# was NA.
full_2 <- lm(
  Revenues ~ . - total.images - total.links - X1x1 - X21pxx173px - X46x214 -
    X49x49 - X200pxx200px - X1279pxx984px - X300pxx1500px - X160x233 -
    X300x993 - X41x192 - X28x221 - X15x12,
  data = total_500_final_train
)
summary(full_2)
## 
## Call:
## lm(formula = Revenues ~ . - total.images - total.links - X1x1 - 
##     X21pxx173px - X46x214 - X49x49 - X200pxx200px - X1279pxx984px - 
##     X300pxx1500px - X160x233 - X300x993 - X41x192 - X28x221 - 
##     X15x12, data = total_500_final_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.199  -8.486  -3.461   2.223  65.721 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        438.837989  29.754450  14.749  < 2e-16 ***
## non.document.error  -0.973835   1.984634  -0.491 0.623996    
## number_of_errors     0.012911   0.010948   1.179 0.239167    
## number_of_warning   -0.020989   0.038979  -0.538 0.590653    
## facebook            -3.004443   2.410328  -1.246 0.213530    
## instagram            2.605986   2.234726   1.166 0.244460    
## linkedin            -0.325461   2.191702  -0.148 0.882047    
## pinterest            1.109936   3.221429   0.345 0.730669    
## twitter              1.567012   2.451651   0.639 0.523189    
## youtube              2.272481   2.072409   1.097 0.273696    
## Flesh_Mesaure        0.001529   0.004208   0.363 0.716631    
## Readability          1.613550   0.770141   2.095 0.036973 *  
## Sentences           -0.006214   0.010664  -0.583 0.560540    
## Unique.words        -0.005883   0.019639  -0.300 0.764725    
## Words                0.002363   0.004897   0.482 0.629833    
## external             0.004634   0.017992   0.258 0.796924    
## internal             0.001545   0.007942   0.195 0.845906    
## X15x75             -20.966921  20.586365  -1.018 0.309244    
## X8x15              -30.814241  20.765759  -1.484 0.138856    
## X44x556            -17.975428  20.815629  -0.864 0.388503    
## X800x1200           -9.979089  18.025681  -0.554 0.580250    
## autox100.           -3.048954  18.426054  -0.165 0.868682    
## X24pxx133px        -15.566911  18.548403  -0.839 0.401973    
## X50x45              -6.206753  18.046674  -0.344 0.731134    
## X400x300           -12.942588  18.222831  -0.710 0.478091    
## X292pxx292px        -6.790009  14.896169  -0.456 0.648838    
## X29x29              -7.177375  14.533766  -0.494 0.621769    
## X115x223            -8.312448  18.150382  -0.458 0.647291    
## X60x60             -75.505052  14.921561  -5.060  7.2e-07 ***
## .bmp                 2.398250   0.633335   3.787 0.000183 ***
## .dib                 0.316324   1.107796   0.286 0.775419    
## .gif                -0.064718   0.066521  -0.973 0.331361    
## .jpe                -0.610764   4.148684  -0.147 0.883055    
## .jpeg                0.640994   4.149986   0.154 0.877350    
## .jpg                 0.007073   0.024010   0.295 0.768499    
## .png                -0.011736   0.031687  -0.370 0.711358    
## .tif                -0.009165   0.042009  -0.218 0.827451    
## .tiff               -3.166227   5.182089  -0.611 0.541652    
## loading.time        -5.750583   2.043199  -2.814 0.005199 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.37 on 309 degrees of freedom
## Multiple R-squared:  0.8179, Adjusted R-squared:  0.7955 
## F-statistic: 36.52 on 38 and 309 DF,  p-value: < 2.2e-16
# Design matrix for glmnet: every predictor of full_2, minus the constant
# "(Intercept)" column (glmnet fits its own intercept by default).
#
# The earlier version removed columns 18, 22, 26, ... from
# model.matrix(full_2). Those positions are the NA coefficients of `full`,
# but the formula of full_2 already excludes those predictors. Applied to the
# 39-column matrix of full_2, the indices therefore removed valid predictors
# (X15x75, autox100., X292pxx292px, X29x29, X115x223, .gif, .jpe, .jpeg,
# .tif, .tiff, loading.time) and kept the "(Intercept)" column, which is why
# the lasso coefficient tables show two "(Intercept)" rows.
#
# NOTE: the knitted output recorded below (dim(x), the lasso coefficient
# tables and the variables hand-picked for full_3 / full_4) was produced with
# the old 28-column matrix and has to be regenerated.
x <- model.matrix(full_2)[, -1, drop = FALSE]
dim(x)
## [1] 348  28
# Lasso path of Revenues on the columns of x, drawn with lambda on the x-axis
# and the curves labelled.
lasso <- glmnet(x, total_500_final_train$Revenues)
par(mfrow = c(1, 1), no.readonly = TRUE)
plot(lasso, xvar = "lambda", label = TRUE)

# Cross-validated lasso on the same design matrix and response.
# NOTE(review): cv.glmnet assigns observations to folds at random, so
# lambda.min / lambda.1se change between runs unless a seed is set earlier in
# the script -- confirm.
lassob <- cv.glmnet(x, total_500_final_train$Revenues)
lassob$lambda.min
## [1] 1.804209
lassob$lambda.1se
## [1] 4.167957
plot(lassob)

# Coefficients at lambda.min
blasso <- coef(lassob, s = "lambda.min")
blasso
## 29 x 1 sparse Matrix of class "dgCMatrix"
##                               1
## (Intercept)        380.68399196
## (Intercept)          .         
## non.document.error   .         
## number_of_errors     .         
## number_of_warning    .         
## facebook             .         
## instagram            .         
## linkedin             .         
## pinterest            .         
## twitter              .         
## youtube              .         
## Flesh_Mesaure        .         
## Readability          .         
## Sentences            .         
## Unique.words         .         
## Words                .         
## external             .         
## internal             .         
## X8x15              -27.76901174
## X44x556            -22.67638827
## X800x1200           -3.39121192
## X24pxx133px        -14.48085601
## X50x45              -4.29553809
## X400x300           -27.12156885
## X60x60             -82.43258867
## .bmp                 1.11787313
## .dib                 0.04443869
## .jpg                 .         
## .png                 .
dim(blasso)
## [1] 29  1
# Scale the lasso coefficients (glmnet's own intercept dropped) by the
# standard deviation of the corresponding column of x.
zblasso <- blasso[-1] * apply(x, 2, sd)
# Same scaling for the OLS coefficients of full_2. The coefficients are picked
# by column name so that each one is multiplied by the standard deviation of
# its own column. The previous `coef(full_2)[-1] * apply(x, 2, sd)` multiplied
# a vector of 38 coefficients by 28 standard deviations: R recycled the
# shorter vector (the "longer object length" warning recorded below) and
# paired coefficients with the wrong columns.
# NOTE: the values of sum(azbolt) and s recorded below predate this fix.
zbolt <- coef(full_2)[colnames(x)] * apply(x, 2, sd)
## Warning in coef(full_2)[-1] * apply(x, 2, sd): longer object length is not
## a multiple of shorter object length
# Sum of the absolute scaled OLS coefficients.
azbolt <- abs(zbolt)
sum(azbolt)
## [1] 5970.87
# Ratio of the summed absolute scaled lasso coefficients to the summed
# absolute scaled OLS coefficients (azbolt is already non-negative).
s <- sum(abs(zblasso)) / sum(azbolt)
s
## [1] 0.005288955
# OLS refit on the predictors that have a non-zero lasso coefficient at
# lambda.min.
full_3 <- lm(
  Revenues ~ 1 + X8x15 + X44x556 + X800x1200 + X24pxx133px + X50x45 +
    X400x300 + X60x60 + .bmp + .dib,
  data = total_500_final_train
)
summary(full_3)
## 
## Call:
## lm(formula = Revenues ~ 1 + X8x15 + X44x556 + X800x1200 + X24pxx133px + 
##     X50x45 + X400x300 + X60x60 + .bmp + .dib, data = total_500_final_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.892  -8.676  -4.842   1.878  66.817 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 428.8923    20.2160  21.215  < 2e-16 ***
## X8x15       -41.0270    17.4942  -2.345   0.0196 *  
## X44x556     -24.1340    20.2005  -1.195   0.2330    
## X800x1200    -5.3723    16.4937  -0.326   0.7448    
## X24pxx133px -15.7302    13.0394  -1.206   0.2285    
## X50x45       -4.8865    17.4942  -0.279   0.7802    
## X400x300    -29.5016    15.1628  -1.946   0.0525 .  
## X60x60      -85.9726     5.1401 -16.726  < 2e-16 ***
## .bmp          2.5989     0.6183   4.203 3.37e-05 ***
## .dib          0.9183     0.3771   2.435   0.0154 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.28 on 338 degrees of freedom
## Multiple R-squared:  0.8032, Adjusted R-squared:  0.7979 
## F-statistic: 153.2 on 9 and 338 DF,  p-value: < 2.2e-16
# Store the AIC and the adjusted R-squared of full_3, then draw its first
# three diagnostic plots.
aic_f3 <- AIC(full_3)
ad_r_sq_f3 <- summary(full_3)$adj.r.squared
plot(full_3, which = 1:3)
## Warning: not plotting observations with leverage one:
##   4

##############################################
# Coefficients at lambda.1se
blassob <- coef(lassob, s = "lambda.1se")
blassob
## 29 x 1 sparse Matrix of class "dgCMatrix"
##                              1
## (Intercept)        317.9194224
## (Intercept)          .        
## non.document.error   .        
## number_of_errors     .        
## number_of_warning    .        
## facebook             .        
## instagram            .        
## linkedin             .        
## pinterest            .        
## twitter              .        
## youtube              .        
## Flesh_Mesaure        .        
## Readability          .        
## Sentences            .        
## Unique.words         .        
## Words                .        
## external             .        
## internal             .        
## X8x15              -10.5491256
## X44x556            -20.5169976
## X800x1200           -0.9339037
## X24pxx133px        -12.8259711
## X50x45              -3.5213411
## X400x300           -25.8836893
## X60x60             -76.2263582
## .bmp                 .        
## .dib                 .        
## .jpg                 .        
## .png                 .
# Scale the lambda.1se lasso coefficients (glmnet's own intercept dropped) by
# the standard deviation of the corresponding column of x.
zblassob <- blassob[-1] * apply(x, 2, sd)
# Same scaling for the OLS coefficients of full_2, picked by column name so
# that each coefficient meets the standard deviation of its own column. The
# previous `coef(full_2)[-1] * apply(x, 2, sd)` combined vectors of different
# length; R recycled the shorter one (the warning recorded below) and paired
# coefficients with the wrong columns.
# NOTE: the value of s recorded below predates this fix.
zboltb <- coef(full_2)[colnames(x)] * apply(x, 2, sd)
## Warning in coef(full_2)[-1] * apply(x, 2, sd): longer object length is not
## a multiple of shorter object length
# Ratio of the summed absolute scaled lasso coefficients (lambda.1se) to the
# summed absolute scaled OLS coefficients.
s <- sum(abs(zblassob)) / sum(abs(zboltb))
s
## [1] 0.004421009
# At lambda.1se the lasso keeps seven predictors with a non-zero coefficient
# (X8x15, X44x556, X800x1200, X24pxx133px, X50x45, X400x300, X60x60);
# full_4 is the OLS refit on exactly those predictors.
full_4 <- lm(
  Revenues ~ 1 + X8x15 + X44x556 + X800x1200 + X24pxx133px + X50x45 +
    X400x300 + X60x60,
  data = total_500_final_train
)
summary(full_4)
## 
## Call:
## lm(formula = Revenues ~ 1 + X8x15 + X44x556 + X800x1200 + X24pxx133px + 
##     X50x45 + X400x300 + X60x60, data = total_500_final_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.940  -9.004  -5.191   1.943  66.430 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  428.505     20.847  20.554   <2e-16 ***
## X8x15        -41.027     18.041  -2.274   0.0236 *  
## X44x556      -24.134     20.831  -1.159   0.2475    
## X800x1200     -5.372     17.009  -0.316   0.7523    
## X24pxx133px  -15.730     13.447  -1.170   0.2429    
## X50x45        -4.887     18.041  -0.271   0.7867    
## X400x300     -28.009     15.624  -1.793   0.0739 .  
## X60x60       -87.077      5.271 -16.521   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.73 on 340 degrees of freedom
## Multiple R-squared:  0.7894, Adjusted R-squared:  0.7851 
## F-statistic: 182.1 on 7 and 340 DF,  p-value: < 2.2e-16
# Store the AIC and the adjusted R-squared of full_4, then draw its first
# three diagnostic plots.
aic_f4 <- AIC(full_4)
ad_r_sq_f4 <- summary(full_4)$adj.r.squared
plot(full_4, which = 1:3)
## Warning: not plotting observations with leverage one:
##   4

###############################################
# Stepwise selection in both directions: the search starts from model_null,
# may not go below it, and may add terms up to those of full_2.
model_a <- step(
  model_null,
  scope = list(lower = model_null, upper = full_2),
  direction = "both"
)
## Start:  AIC=2408.24
## Revenues ~ 1
## 
##                      Df Sum of Sq    RSS    AIC
## + X60x60              1    249797 100551 1975.8
## + X115x223            1    247381 102967 1984.1
## + X29x29              1    237600 112748 2015.7
## + X292pxx292px        1    220892 129456 2063.8
## + X400x300            1    207118 143231 2099.0
## + X50x45              1    195591 154758 2125.9
## + X24pxx133px         1    170701 179647 2177.8
## + autox100.           1    154606 195743 2207.7
## + X800x1200           1    121714 228634 2261.7
## + X44x556             1    105033 245316 2286.2
## + X8x15               1     80446 269903 2319.5
## + X15x75              1     44830 305518 2362.6
## + loading.time        1      4505 345844 2405.7
## + number_of_warning   1      2937 347411 2407.3
## + .bmp                1      2746 347602 2407.5
## + .dib                1      2728 347620 2407.5
## + non.document.error  1      2320 348028 2407.9
## <none>                            350348 2408.2
## + number_of_errors    1      1473 348875 2408.8
## + .jpe                1      1398 348951 2408.8
## + .jpeg               1      1384 348964 2408.9
## + Words               1      1122 349226 2409.1
## + Unique.words        1      1019 349329 2409.2
## + pinterest           1       909 349439 2409.3
## + Sentences           1       824 349525 2409.4
## + youtube             1       648 349701 2409.6
## + instagram           1       448 349900 2409.8
## + .gif                1       245 350103 2410.0
## + .png                1       217 350132 2410.0
## + .tiff               1       211 350138 2410.0
## + Flesh_Mesaure       1       199 350150 2410.0
## + facebook            1       140 350209 2410.1
## + linkedin            1       134 350214 2410.1
## + twitter             1       126 350222 2410.1
## + internal            1        29 350320 2410.2
## + .jpg                1        24 350324 2410.2
## + .tif                1         9 350340 2410.2
## + external            1         1 350348 2410.2
## + Readability         1         1 350348 2410.2
## 
## Step:  AIC=1975.85
## Revenues ~ X60x60
## 
##                      Df Sum of Sq    RSS    AIC
## + X44x556             1     18449  82102 1907.3
## + X24pxx133px         1     18204  82348 1908.3
## + autox100.           1     17756  82796 1910.2
## + X800x1200           1     17731  82821 1910.3
## + X50x45              1     17058  83494 1913.2
## + X400x300            1     16545  84007 1915.3
## + X8x15               1     16357  84194 1916.1
## + X292pxx292px        1     12453  88098 1931.8
## + X15x75              1      9903  90648 1941.8
## + X29x29              1      6348  94203 1955.2
## + .bmp                1      3599  96952 1965.2
## + loading.time        1      2728  97823 1968.3
## + X115x223            1      2630  97922 1968.6
## + .jpe                1      1382  99170 1973.0
## + .jpeg               1      1380  99171 1973.0
## + youtube             1      1026  99526 1974.3
## + non.document.error  1       813  99739 1975.0
## <none>                            100551 1975.8
## + instagram           1       543 100009 1976.0
## + .dib                1       528 100023 1976.0
## + Readability         1       468 100083 1976.2
## + number_of_errors    1       274 100278 1976.9
## + linkedin            1       269 100283 1976.9
## + external            1       196 100356 1977.2
## + number_of_warning   1        97 100455 1977.5
## + twitter             1        50 100502 1977.7
## + .gif                1        19 100532 1977.8
## + Flesh_Mesaure       1        18 100534 1977.8
## + .tiff               1        16 100535 1977.8
## + Words               1        10 100541 1977.8
## + internal            1         7 100545 1977.8
## + Unique.words        1         3 100549 1977.8
## + .tif                1         3 100549 1977.8
## + pinterest           1         2 100549 1977.8
## + .png                1         2 100550 1977.8
## + facebook            1         1 100550 1977.8
## + .jpg                1         1 100551 1977.8
## + Sentences           1         0 100551 1977.8
## - X60x60              1    249797 350348 2408.2
## 
## Step:  AIC=1907.3
## Revenues ~ X60x60 + X44x556
## 
##                      Df Sum of Sq    RSS    AIC
## + X400x300            1      6571  75531 1880.3
## + X50x45              1      6101  76002 1882.4
## + X292pxx292px        1      5235  76867 1886.4
## + X24pxx133px         1      5038  77064 1887.3
## + autox100.           1      3736  78366 1893.1
## + .bmp                1      3599  78503 1893.7
## + X29x29              1      2872  79230 1896.9
## + loading.time        1      2461  79641 1898.7
## + .jpe                1      1450  80653 1903.1
## + .jpeg               1      1446  80656 1903.1
## + X115x223            1      1352  80751 1903.5
## + X800x1200           1      1269  80833 1903.9
## + X8x15               1      1122  80980 1904.5
## + X15x75              1       947  81155 1905.3
## + youtube             1       836  81266 1905.7
## + .dib                1       778  81324 1906.0
## + non.document.error  1       535  81567 1907.0
## <none>                             82102 1907.3
## + instagram           1       386  81716 1907.7
## + number_of_errors    1       383  81719 1907.7
## + Readability         1       304  81798 1908.0
## + external            1       267  81835 1908.2
## + twitter             1       235  81867 1908.3
## + number_of_warning   1       121  81981 1908.8
## + linkedin            1       104  81998 1908.9
## + pinterest           1        89  82013 1908.9
## + facebook            1        50  82052 1909.1
## + .png                1        48  82054 1909.1
## + .jpg                1        35  82067 1909.2
## + Flesh_Mesaure       1        19  82083 1909.2
## + .tiff               1        16  82086 1909.2
## + Words               1        15  82088 1909.2
## + .tif                1        14  82088 1909.2
## + internal            1         8  82094 1909.3
## + .gif                1         3  82099 1909.3
## + Sentences           1         1  82101 1909.3
## + Unique.words        1         0  82102 1909.3
## - X44x556             1     18449 100551 1975.8
## - X60x60              1    163213 245316 2286.2
## 
## Step:  AIC=1880.27
## Revenues ~ X60x60 + X44x556 + X400x300
## 
##                      Df Sum of Sq    RSS    AIC
## + .bmp                1      3599  71932 1865.3
## + loading.time        1      1874  73657 1873.5
## + .jpe                1      1345  74186 1876.0
## + .jpeg               1      1344  74187 1876.0
## + .dib                1      1205  74326 1876.7
## + X8x15               1      1122  74409 1877.1
## + X15x75              1       947  74584 1877.9
## + X24pxx133px         1       600  74931 1879.5
## + youtube             1       538  74993 1879.8
## + autox100.           1       443  75088 1880.2
## <none>                             75531 1880.3
## + X29x29              1       315  75216 1880.8
## + X115x223            1       290  75241 1880.9
## + instagram           1       271  75260 1881.0
## + Readability         1       268  75263 1881.0
## + X292pxx292px        1       262  75269 1881.1
## + non.document.error  1       249  75282 1881.1
## + X50x45              1       227  75304 1881.2
## + external            1       209  75322 1881.3
## + number_of_errors    1       205  75326 1881.3
## + X800x1200           1       169  75362 1881.5
## + pinterest           1       156  75375 1881.5
## + twitter             1        79  75452 1881.9
## + .jpg                1        76  75455 1881.9
## + .tif                1        28  75503 1882.2
## + Flesh_Mesaure       1        23  75508 1882.2
## + internal            1        18  75513 1882.2
## + .png                1        17  75514 1882.2
## + .tiff               1        16  75515 1882.2
## + .gif                1         9  75522 1882.2
## + Words               1         5  75526 1882.2
## + Sentences           1         4  75527 1882.2
## + number_of_warning   1         2  75529 1882.3
## + Unique.words        1         2  75529 1882.3
## + facebook            1         1  75530 1882.3
## + linkedin            1         0  75531 1882.3
## - X400x300            1      6571  82102 1907.3
## - X44x556             1      8476  84007 1915.3
## - X60x60              1     59224 134755 2079.7
## 
## Step:  AIC=1865.28
## Revenues ~ X60x60 + X44x556 + X400x300 + .bmp
## 
##                      Df Sum of Sq    RSS    AIC
## + loading.time        1      1725  70207 1858.8
## + .jpe                1      1363  70569 1860.6
## + .jpeg               1      1362  70570 1860.6
## + .dib                1      1210  70722 1861.4
## + X8x15               1      1122  70810 1861.8
## + X15x75              1       947  70985 1862.7
## + X24pxx133px         1       600  71332 1864.4
## + autox100.           1       443  71489 1865.1
## <none>                             71932 1865.3
## + youtube             1       398  71534 1865.3
## + X29x29              1       315  71617 1865.8
## + instagram           1       310  71622 1865.8
## + X115x223            1       290  71642 1865.9
## + X292pxx292px        1       262  71670 1866.0
## + X50x45              1       227  71705 1866.2
## + Readability         1       224  71708 1866.2
## + non.document.error  1       202  71730 1866.3
## + number_of_errors    1       192  71740 1866.3
## + pinterest           1       176  71756 1866.4
## + twitter             1       175  71757 1866.4
## + X800x1200           1       169  71763 1866.5
## + external            1       151  71781 1866.5
## + .jpg                1        63  71869 1867.0
## + Words               1        32  71900 1867.1
## + facebook            1        22  71910 1867.2
## + Flesh_Mesaure       1        20  71912 1867.2
## + .png                1        17  71915 1867.2
## + linkedin            1        15  71917 1867.2
## + .tiff               1        14  71918 1867.2
## + .gif                1         7  71925 1867.2
## + .tif                1         5  71927 1867.3
## + Unique.words        1         4  71928 1867.3
## + Sentences           1         3  71928 1867.3
## + internal            1         2  71929 1867.3
## + number_of_warning   1         1  71930 1867.3
## - .bmp                1      3599  75531 1880.3
## - X400x300            1      6571  78503 1893.7
## - X44x556             1      8476  80408 1902.0
## - X60x60              1     59518 131450 2073.1
## 
## Step:  AIC=1858.84
## Revenues ~ X60x60 + X44x556 + X400x300 + .bmp + loading.time
## 
##                      Df Sum of Sq    RSS    AIC
## + .jpeg               1      1425  68782 1853.7
## + .jpe                1      1425  68782 1853.7
## + .dib                1      1370  68837 1854.0
## + X8x15               1       951  69256 1856.1
## + X15x75              1       870  69337 1856.5
## + X24pxx133px         1       671  69535 1857.5
## + autox100.           1       499  69708 1858.3
## + youtube             1       451  69756 1858.6
## <none>                             70207 1858.8
## + instagram           1       333  69874 1859.2
## + external            1       327  69880 1859.2
## + Readability         1       296  69910 1859.4
## + number_of_errors    1       292  69915 1859.4
## + twitter             1       261  69946 1859.5
## + X800x1200           1       258  69949 1859.6
## + X50x45              1       245  69962 1859.6
## + X29x29              1       230  69977 1859.7
## + X292pxx292px        1       194  70013 1859.9
## + Words               1       176  70031 1860.0
## + pinterest           1       150  70057 1860.1
## + non.document.error  1       149  70058 1860.1
## + .jpg                1       145  70062 1860.1
## + X115x223            1       137  70070 1860.2
## + Unique.words        1       114  70093 1860.3
## + internal            1       109  70098 1860.3
## + .png                1        91  70116 1860.4
## + linkedin            1        82  70125 1860.4
## + Sentences           1        60  70147 1860.5
## + .tif                1        29  70178 1860.7
## + facebook            1        22  70185 1860.7
## + .gif                1        11  70196 1860.8
## + number_of_warning   1        10  70197 1860.8
## + .tiff               1         5  70202 1860.8
## + Flesh_Mesaure       1         3  70204 1860.8
## - loading.time        1      1725  71932 1865.3
## - .bmp                1      3450  73657 1873.5
## - X400x300            1      6006  76213 1885.4
## - X44x556             1      8582  78789 1897.0
## - X60x60              1     60220 130427 2072.4
## 
## Step:  AIC=1853.7
## Revenues ~ X60x60 + X44x556 + X400x300 + .bmp + loading.time + 
##     .jpeg
## 
##                      Df Sum of Sq    RSS    AIC
## + X8x15               1       948  67834 1850.9
## + X15x75              1       868  67914 1851.3
## + X24pxx133px         1       635  68148 1852.5
## + autox100.           1       535  68247 1853.0
## + Readability         1       476  68306 1853.3
## <none>                             68782 1853.7
## + youtube             1       377  68405 1853.8
## + X800x1200           1       271  68511 1854.3
## + number_of_errors    1       242  68540 1854.5
## + X50x45              1       234  68548 1854.5
## + X29x29              1       229  68553 1854.5
## + twitter             1       217  68565 1854.6
## + instagram           1       204  68578 1854.7
## + X292pxx292px        1       192  68590 1854.7
## + .gif                1       160  68622 1854.9
## + .jpg                1       156  68626 1854.9
## + Words               1       151  68631 1854.9
## + linkedin            1       138  68644 1855.0
## + X115x223            1       135  68647 1855.0
## + Unique.words        1       130  68652 1855.0
## + external            1       125  68658 1855.1
## + non.document.error  1       120  68662 1855.1
## + internal            1        55  68727 1855.4
## + Sentences           1        46  68736 1855.5
## + .dib                1        35  68747 1855.5
## + pinterest           1        34  68748 1855.5
## + .tif                1        26  68756 1855.6
## + number_of_warning   1        18  68764 1855.6
## + facebook            1        10  68772 1855.7
## + .tiff               1         4  68778 1855.7
## + Flesh_Mesaure       1         1  68781 1855.7
## + .jpe                1         0  68782 1855.7
## + .png                1         0  68782 1855.7
## - .jpeg               1      1425  70207 1858.8
## - loading.time        1      1788  70570 1860.6
## - .bmp                1      3465  72247 1868.8
## - X400x300            1      5897  74679 1880.3
## - X44x556             1      8671  77453 1893.0
## - X60x60              1     60403 129185 2071.1
## 
## Step:  AIC=1850.87
## Revenues ~ X60x60 + X44x556 + X400x300 + .bmp + loading.time + 
##     .jpeg + X8x15
## 
##                      Df Sum of Sq    RSS    AIC
## + X24pxx133px         1       631  67203 1849.6
## + Readability         1       571  67263 1849.9
## + autox100.           1       532  67302 1850.1
## + youtube             1       431  67403 1850.7
## <none>                             67834 1850.9
## + twitter             1       310  67524 1851.3
## + instagram           1       309  67525 1851.3
## + number_of_errors    1       275  67559 1851.5
## + X800x1200           1       266  67568 1851.5
## + X15x75              1       265  67569 1851.5
## + X50x45              1       233  67601 1851.7
## + X29x29              1       233  67601 1851.7
## + X292pxx292px        1       196  67639 1851.9
## + Unique.words        1       173  67661 1852.0
## + Words               1       169  67665 1852.0
## + linkedin            1       166  67668 1852.0
## + .gif                1       162  67673 1852.0
## + .jpg                1       156  67678 1852.1
## + X115x223            1       141  67693 1852.2
## + non.document.error  1       122  67712 1852.2
## + external            1       119  67715 1852.3
## + Sentences           1        65  67770 1852.5
## + internal            1        51  67784 1852.6
## + facebook            1        36  67798 1852.7
## + pinterest           1        35  67799 1852.7
## + .dib                1        33  67801 1852.7
## + .tif                1        25  67809 1852.7
## + .tiff               1         4  67830 1852.8
## + Flesh_Mesaure       1         1  67833 1852.9
## + number_of_warning   1         0  67834 1852.9
## + .png                1         0  67834 1852.9
## + .jpe                1         0  67834 1852.9
## - X8x15               1       948  68782 1853.7
## - X44x556             1      1330  69164 1855.6
## - .jpeg               1      1422  69256 1856.1
## - loading.time        1      1613  69448 1857.0
## - .bmp                1      3472  71306 1866.2
## - X400x300            1      5922  73756 1878.0
## - X60x60              1     60364 128198 2070.4
## 
## Step:  AIC=1849.62
## Revenues ~ X60x60 + X44x556 + X400x300 + .bmp + loading.time + 
##     .jpeg + X8x15 + X24pxx133px
## 
##                      Df Sum of Sq    RSS    AIC
## + Readability         1       660  66543 1848.2
## + youtube             1       420  66783 1849.4
## <none>                             67203 1849.6
## + instagram           1       324  66880 1849.9
## + number_of_errors    1       275  66928 1850.2
## + twitter             1       269  66934 1850.2
## + X15x75              1       265  66938 1850.2
## + X29x29              1       231  66972 1850.4
## + X292pxx292px        1       194  67009 1850.6
## + Unique.words        1       179  67024 1850.7
## + .gif                1       177  67026 1850.7
## + linkedin            1       160  67043 1850.8
## + Words               1       159  67044 1850.8
## - X24pxx133px         1       631  67834 1850.9
## + .jpg                1       142  67062 1850.9
## + X115x223            1       139  67065 1850.9
## + non.document.error  1       121  67082 1851.0
## + external            1       105  67098 1851.1
## + pinterest           1        65  67138 1851.3
## + X800x1200           1        64  67140 1851.3
## + Sentences           1        63  67140 1851.3
## + internal            1        50  67153 1851.4
## + .dib                1        49  67154 1851.4
## + autox100.           1        41  67163 1851.4
## + facebook            1        23  67180 1851.5
## + .tif                1        23  67181 1851.5
## - X44x556             1       761  67964 1851.5
## + X50x45              1        15  67188 1851.5
## + .tiff               1         4  67199 1851.6
## + .png                1         3  67200 1851.6
## + number_of_warning   1         1  67202 1851.6
## + Flesh_Mesaure       1         1  67202 1851.6
## + .jpe                1         0  67203 1851.6
## - X8x15               1       944  68148 1852.5
## - .jpeg               1      1385  68588 1854.7
## - loading.time        1      1680  68883 1856.2
## - X400x300            1      1817  69020 1856.9
## - .bmp                1      3469  70672 1865.1
## - X60x60              1     60378 127581 2070.7
## 
## Step:  AIC=1848.19
## Revenues ~ X60x60 + X44x556 + X400x300 + .bmp + loading.time + 
##     .jpeg + X8x15 + X24pxx133px + Readability
## 
##                      Df Sum of Sq    RSS    AIC
## + instagram           1       510  66033 1847.5
## + number_of_errors    1       385  66159 1848.2
## <none>                             66543 1848.2
## + youtube             1       380  66163 1848.2
## + X29x29              1       270  66273 1848.8
## + X15x75              1       265  66278 1848.8
## + twitter             1       224  66319 1849.0
## + X292pxx292px        1       198  66345 1849.2
## + pinterest           1       178  66365 1849.2
## + non.document.error  1       160  66383 1849.3
## + X115x223            1       159  66384 1849.3
## + .jpg                1       158  66385 1849.4
## + .gif                1       141  66402 1849.5
## - X44x556             1       645  67189 1849.5
## - Readability         1       660  67203 1849.6
## + external            1        92  66451 1849.7
## + .dib                1        87  66456 1849.7
## + Words               1        85  66458 1849.7
## + X800x1200           1        61  66482 1849.9
## + linkedin            1        51  66492 1849.9
## - X24pxx133px         1       720  67263 1849.9
## + Unique.words        1        43  66501 1850.0
## + X50x45              1        27  66516 1850.0
## + Flesh_Mesaure       1        26  66518 1850.0
## + internal            1        21  66523 1850.1
## + facebook            1        17  66526 1850.1
## + Sentences           1        17  66526 1850.1
## + .tiff               1        15  66528 1850.1
## + .tif                1        14  66529 1850.1
## + autox100.           1         9  66534 1850.1
## + .jpe                1         5  66539 1850.2
## + .png                1         4  66539 1850.2
## + number_of_warning   1         1  66543 1850.2
## - X8x15               1      1047  67590 1851.6
## - .jpeg               1      1597  68140 1854.4
## - X400x300            1      1683  68227 1854.9
## - loading.time        1      1792  68335 1855.4
## - .bmp                1      3391  69934 1863.5
## - X60x60              1     60871 127414 2072.2
## 
## Step:  AIC=1847.51
## Revenues ~ X60x60 + X44x556 + X400x300 + .bmp + loading.time + 
##     .jpeg + X8x15 + X24pxx133px + Readability + instagram
## 
##                      Df Sum of Sq    RSS    AIC
## <none>                             66033 1847.5
## + number_of_errors    1       328  65705 1847.8
## + X29x29              1       324  65709 1847.8
## + X15x75              1       265  65768 1848.1
## - instagram           1       510  66543 1848.2
## + X115x223            1       245  65788 1848.2
## - X44x556             1       528  66561 1848.3
## + X292pxx292px        1       216  65817 1848.4
## + .gif                1       181  65852 1848.5
## + youtube             1       157  65877 1848.7
## + non.document.error  1       142  65891 1848.8
## + .jpg                1        81  65953 1849.1
## + .dib                1        76  65958 1849.1
## + X800x1200           1        74  65959 1849.1
## + Words               1        69  65964 1849.1
## + external            1        68  65965 1849.2
## + twitter             1        62  65971 1849.2
## + pinterest           1        38  65995 1849.3
## + Unique.words        1        36  65998 1849.3
## + autox100.           1        26  66008 1849.4
## + .tiff               1        25  66008 1849.4
## + Flesh_Mesaure       1        20  66013 1849.4
## + X50x45              1        18  66015 1849.4
## + facebook            1        18  66016 1849.4
## + .png                1        17  66016 1849.4
## + Sentences           1        17  66016 1849.4
## + internal            1        16  66017 1849.4
## + linkedin            1        12  66021 1849.4
## - X24pxx133px         1       754  66788 1849.5
## + .jpe                1         5  66028 1849.5
## + .tif                1         4  66030 1849.5
## + number_of_warning   1         3  66030 1849.5
## - Readability         1       846  66880 1849.9
## - X8x15               1      1208  67241 1851.8
## - .jpeg               1      1426  67459 1852.9
## - X400x300            1      1587  67620 1853.8
## - loading.time        1      1823  67856 1855.0
## - .bmp                1      3429  69462 1863.1
## - X60x60              1     61259 127292 2073.9
# Print the coefficient table and fit statistics of model_a
summary(model_a)
## 
## Call:
## lm(formula = Revenues ~ X60x60 + X44x556 + X400x300 + .bmp + 
##     loading.time + .jpeg + X8x15 + X24pxx133px + Readability + 
##     instagram, data = total_500_final_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.386  -8.578  -3.882   2.357  66.268 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  422.07586   20.00250  21.101  < 2e-16 ***
## X60x60       -88.85405    5.02526 -17.681  < 2e-16 ***
## X44x556      -25.93785   15.80312  -1.641  0.10167    
## X400x300     -27.14161    9.53846  -2.845  0.00471 ** 
## .bmp           2.53711    0.60649   4.183 3.67e-05 ***
## loading.time  -5.41135    1.77406  -3.050  0.00247 ** 
## .jpeg          0.03943    0.01462   2.698  0.00734 ** 
## X8x15        -43.04916   17.33589  -2.483  0.01351 *  
## X24pxx133px  -21.05205   10.72927  -1.962  0.05057 .  
## Readability    1.25782    0.60521   2.078  0.03844 *  
## instagram      2.95626    1.83217   1.614  0.10757    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14 on 337 degrees of freedom
## Multiple R-squared:  0.8115, Adjusted R-squared:  0.8059 
## F-statistic: 145.1 on 10 and 337 DF,  p-value: < 2.2e-16
# Keep model_a's adjusted R-squared and AIC for the model comparison below
ad_r_sq_ma <- summary(model_a)[["adj.r.squared"]]
aic_ma <- AIC(model_a)
# Diagnostic plots of the fit: residuals vs fitted, normal Q-Q and
# scale-location (the first three plot.lm panels)
plot(model_a, which = 1:3)
## Warning: not plotting observations with leverage one:
##   3

################
# Compare the adjusted R-squared and the AIC of the models we created to find the best one
ad_r_sq_f3 
## [1] 0.797919
ad_r_sq_f4 
## [1] 0.7850981
ad_r_sq_ma
## [1] 0.8059286
# The best adjusted R-squared (the closer to 1 the better) is ad_r_sq_ma, i.e. model ma
aic_f3
## [1] 2850.194
aic_f4
## [1] 2869.653
aic_ma 
## [1] 2837.088
# Model ma also has the lowest AIC (lower is better), so it is the best on both criteria
#######################################################################################################
# 2 x 2 panel on the test data: the actual revenues first, then the
# predictions of model_a, full_3 and full_4 for the same rows
par(mfrow = c(2, 2))

# Values to draw: observed revenues and each model's predictions
Actual_Revenues <- total_500_final_test[["Revenues"]]
predictions_ma <- predict(model_a, newdata = total_500_final_test)
predictions_full3 <- predict(full_3, newdata = total_500_final_test)
predictions_full4 <- predict(full_4, newdata = total_500_final_test)

# One panel per series, in the same order as before
plot(Actual_Revenues, col = "blue")
plot(predictions_ma, col = "Red", main = "Model a")
plot(predictions_full3, col = "Red", main = "Full_3 model")
plot(predictions_full4, col = "Red", main = "Full_4 model")

#####################################
#From the plots above we can see that the actual Revenues follow a smooth curve, except for the Revenues of the #1 ranked company, which are extremely high compared with those of the other companies.
#The smoothest predictions come from model a, which, as we said before, also has the best adjusted R-squared and the best AIC value.
# Back to a single plot per device for the correlation matrix
par(mfrow = c(1, 1))
# Columns of the training set, selected by position.
# NOTE(review): presumably Revenues plus the ten predictors of model_a --
# confirm against names(total_500_final_train)
total_500_final_reg <- total_500_final_train[
  ,
  c(1, 6, 12, 20, 21, 25, 30, 42, 43, 47, 53)
]
# Pairwise correlations, shown as numbers
corrplot(cor(total_500_final_reg), method = "number")

#We can see here that the variable X8x15 is very highly correlated with the variable X44x556, and that the variable X24pxx133px is very highly correlated with the variable X400x300.
#So we fit a new model that drops one variable from each correlated pair (X8x15 and X24pxx133px) to see whether the model improves
# Same predictors as model_a, but without X8x15 and X24pxx133px
full_5 <- lm(
  Revenues ~ 1 + X60x60 + X44x556 + X400x300 + .bmp + loading.time + .jpeg +
    Readability + instagram,
  data = total_500_final_train
)
summary(full_5)
## 
## Call:
## lm(formula = Revenues ~ 1 + X60x60 + X44x556 + X400x300 + .bmp + 
##     loading.time + .jpeg + Readability + instagram, data = total_500_final_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.944  -8.620  -4.054   2.701  65.967 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  394.88728   16.79045  23.519  < 2e-16 ***
## X60x60       -88.71740    5.08374 -17.451  < 2e-16 ***
## X44x556      -63.78453    9.77718  -6.524 2.50e-10 ***
## X400x300     -39.25886    7.36190  -5.333 1.77e-07 ***
## .bmp           2.53695    0.61357   4.135 4.48e-05 ***
## loading.time  -5.54201    1.79029  -3.096  0.00213 ** 
## .jpeg          0.03997    0.01479   2.703  0.00722 ** 
## Readability    1.04666    0.60852   1.720  0.08634 .  
## instagram      2.32848    1.84028   1.265  0.20664    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.16 on 339 degrees of freedom
## Multiple R-squared:  0.8059, Adjusted R-squared:  0.8014 
## F-statistic:   176 on 8 and 339 DF,  p-value: < 2.2e-16
# Keep full_5's adjusted R-squared and AIC for the comparison with model_a
adj_r_square_full5 <- summary(full_5)[["adj.r.squared"]]
aic_full5 <- AIC(full_5)
# Diagnostic plots of the fit: residuals vs fitted, normal Q-Q and
# scale-location (the first three plot.lm panels)
plot(full_5, which = 1:3)

# Compare model_a with full_5: adjusted R-squared (higher is better) and AIC (lower is better)
ad_r_sq_ma
## [1] 0.8059286
adj_r_square_full5 
## [1] 0.8013705
aic_ma 
## [1] 2837.088
aic_full5 
## [1] 2843.226
# The adjusted R-squared and the AIC of full_5 are both a little worse than those of model_a
#######################################################################################################
##################################################################################################
#Clustering
#Using the variables kept for the regression, we will try to cluster the companies
# k-means with 3 centres on the 11 columns of total_500_final_reg; the fixed
# seed makes the random starting centres reproducible
set.seed(220)
fortuneCluster <- kmeans(
  total_500_final_reg[, 1:11],
  centers = 3,
  iter.max = 100,
  nstart = 1
)
# Number of companies assigned to each cluster
cluster <- table(fortuneCluster$cluster)
# Turn the cluster labels into a factor so ggplot2 colours them discretely
fortuneCluster$cluster <- as.factor(fortuneCluster$cluster)

# Scatter plots of Revenues against eight of the regression variables,
# coloured by cluster membership
ggplot(
  total_500_final_reg,
  aes(x = Revenues, y = loading.time, colour = fortuneCluster$cluster)
) +
  geom_point(size = 3)

ggplot(
  total_500_final_reg,
  aes(x = Revenues, y = Readability, colour = fortuneCluster$cluster)
) +
  geom_point(size = 3)

ggplot(
  total_500_final_reg,
  aes(x = Revenues, y = instagram, colour = fortuneCluster$cluster)
) +
  geom_point(size = 3)

ggplot(
  total_500_final_reg,
  aes(x = Revenues, y = .bmp, colour = fortuneCluster$cluster)
) +
  geom_point(size = 3)

ggplot(
  total_500_final_reg,
  aes(x = Revenues, y = .jpeg, colour = fortuneCluster$cluster)
) +
  geom_point(size = 3)

ggplot(
  total_500_final_reg,
  aes(x = Revenues, y = X60x60, colour = fortuneCluster$cluster)
) +
  geom_point(size = 3)

ggplot(
  total_500_final_reg,
  aes(x = Revenues, y = X44x556, colour = fortuneCluster$cluster)
) +
  geom_point(size = 3)

ggplot(
  total_500_final_reg,
  aes(x = Revenues, y = X400x300, colour = fortuneCluster$cluster)
) +
  geom_point(size = 3)

#From the clustering we can see that the variables do indeed divide the highest revenues from the lowest ones
# Print model_a's coefficient table again (same output as the earlier summary)
summary(model_a)
## 
## Call:
## lm(formula = Revenues ~ X60x60 + X44x556 + X400x300 + .bmp + 
##     loading.time + .jpeg + X8x15 + X24pxx133px + Readability + 
##     instagram, data = total_500_final_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.386  -8.578  -3.882   2.357  66.268 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  422.07586   20.00250  21.101  < 2e-16 ***
## X60x60       -88.85405    5.02526 -17.681  < 2e-16 ***
## X44x556      -25.93785   15.80312  -1.641  0.10167    
## X400x300     -27.14161    9.53846  -2.845  0.00471 ** 
## .bmp           2.53711    0.60649   4.183 3.67e-05 ***
## loading.time  -5.41135    1.77406  -3.050  0.00247 ** 
## .jpeg          0.03943    0.01462   2.698  0.00734 ** 
## X8x15        -43.04916   17.33589  -2.483  0.01351 *  
## X24pxx133px  -21.05205   10.72927  -1.962  0.05057 .  
## Readability    1.25782    0.60521   2.078  0.03844 *  
## instagram      2.95626    1.83217   1.614  0.10757    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14 on 337 degrees of freedom
## Multiple R-squared:  0.8115, Adjusted R-squared:  0.8059 
## F-statistic: 145.1 on 10 and 337 DF,  p-value: < 2.2e-16
#We can see from the model that the main variable that affects a company's revenues is whether or not it has an image of size X60x60
#We will fit a model that leaves this variable out entirely, just to see how well the remaining variables explain the revenues
# Same predictors as full_5, but without X60x60
full_6 <- lm(
  Revenues ~ 1 + X44x556 + X400x300 + .bmp + loading.time + .jpeg +
    Readability + instagram,
  data = total_500_final_train
)
summary(full_6)
## 
## Call:
## lm(formula = Revenues ~ 1 + X44x556 + X400x300 + .bmp + loading.time + 
##     .jpeg + Readability + instagram, data = total_500_final_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.704 -10.830  -6.491   1.438  98.117 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   396.90230   23.09946  17.182  < 2e-16 ***
## X44x556       -63.96097   13.45126  -4.755 2.94e-06 ***
## X400x300     -126.11557    7.46301 -16.899  < 2e-16 ***
## .bmp            2.44616    0.84411   2.898   0.0040 ** 
## loading.time   -4.19747    2.46076  -1.706   0.0890 .  
## .jpeg           0.03687    0.02034   1.813   0.0707 .  
## Readability     0.42768    0.83576   0.512   0.6092    
## instagram       0.84978    2.52914   0.336   0.7371    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.48 on 340 degrees of freedom
## Multiple R-squared:  0.6316, Adjusted R-squared:  0.624 
## F-statistic: 83.28 on 7 and 340 DF,  p-value: < 2.2e-16
# Keep full_6's adjusted R-squared and AIC
adj_r_square_full6 <- summary(full_6)[["adj.r.squared"]]
aic_full6 <- AIC(full_6)
# Diagnostic plots of the fit: residuals vs fitted, normal Q-Q and
# scale-location (the first three plot.lm panels)
plot(full_6, which = 1:3)

# Compare model_a and full_6 on total_500_final_test: the actual revenues and
# each model's predictions, drawn in a 2 x 2 panel
predictions_ma <- predict(model_a, newdata = total_500_final_test)
predictions_full_6 <- predict(full_6, newdata = total_500_final_test)
Actual_Revenues <- total_500_final_test$Revenues

par(mfrow = c(2, 2))
plot(Actual_Revenues, col = "blue")
plot(predictions_ma, col = "Red", main = "Model A")
plot(predictions_full_6, col = "Red", main = "Full_6 model")
# Only three of the four cells are used. Restore the single-plot layout (as
# is done after the earlier prediction panel) so that a later plot does not
# land in the leftover cell of this grid.
par(mfrow = c(1, 1))
#######################################################

#We can see that the predictions of the new model are not as good as those of the previous one. Having checked this option as well, we can conclude that the most important factors are the ones in model_a
# Print the summary of model_a, the model retained as the final one
summary(model_a)
## 
## Call:
## lm(formula = Revenues ~ X60x60 + X44x556 + X400x300 + .bmp + 
##     loading.time + .jpeg + X8x15 + X24pxx133px + Readability + 
##     instagram, data = total_500_final_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.386  -8.578  -3.882   2.357  66.268 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  422.07586   20.00250  21.101  < 2e-16 ***
## X60x60       -88.85405    5.02526 -17.681  < 2e-16 ***
## X44x556      -25.93785   15.80312  -1.641  0.10167    
## X400x300     -27.14161    9.53846  -2.845  0.00471 ** 
## .bmp           2.53711    0.60649   4.183 3.67e-05 ***
## loading.time  -5.41135    1.77406  -3.050  0.00247 ** 
## .jpeg          0.03943    0.01462   2.698  0.00734 ** 
## X8x15        -43.04916   17.33589  -2.483  0.01351 *  
## X24pxx133px  -21.05205   10.72927  -1.962  0.05057 .  
## Readability    1.25782    0.60521   2.078  0.03844 *  
## instagram      2.95626    1.83217   1.614  0.10757    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14 on 337 degrees of freedom
## Multiple R-squared:  0.8115, Adjusted R-squared:  0.8059 
## F-statistic: 145.1 on 10 and 337 DF,  p-value: < 2.2e-16